urllib

https://docs.python.org/3/library/urllib.html
urllib 是一个 Python 内置的 HTTP 请求库,包含 urllib.request、urllib.parse、urllib.error 和 urllib.robotparser 四个模块。

快速获取一个静态页面的HTML文档

1
2
3
4
5
6
7
import urllib.request

url = 'https://www.biquge.com.tw/18_18998/'
# urlopen returns an http.client.HTTPResponse object.
resp = urllib.request.urlopen(url)
type(resp)  # <class 'http.client.HTTPResponse'>
states = resp.getcode()  # HTTP status code, e.g. 200
html = resp.read().decode('gbk')  # the site serves GBK-encoded pages

设置超时

1
2
3
4
5
6
import urllib.request

url = 'https://www.biquge.com.tw/18_18998/'
# FIX: the keyword was misspelled 'timmeout', which makes urlopen raise
# TypeError; the correct parameter name is 'timeout' (seconds).
response = urllib.request.urlopen(url, timeout=5)
states = response.getcode()  # HTTP status code
html = response.read().decode('gbk')  # the site serves GBK-encoded pages

简单使用request

1
2
3
4
5
6
7
8
import urllib.request

url = 'https://www.biquge.com.tw/18_18998/'
########################################################
# Build an explicit Request object instead of passing the raw URL string;
# this is the hook for adding headers, data, etc. later.
req = urllib.request.Request(url)
########################################################
resp = urllib.request.urlopen(req)
html = resp.read().decode('gbk')

添加请求头部信息

1
2
3
4
5
6
7
8
9
10
11
12
import urllib.request
import urllib.parse

url = 'https://www.biquge.com.tw/18_18998/'
########################################################
# Send a browser-like User-Agent so the server does not reject the client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
}
req = urllib.request.Request(url, headers=headers)
########################################################
resp = urllib.request.urlopen(req)
html = resp.read()

GET数据

GET 请求的参数经 urlencode 处理后仍是字符串,可以直接拼接到 URL 后面,无需再编码成字节。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import urllib.request
import urllib.parse

url = 'http://www.baidu.com/s?'

# Browser-like header dict (defined but, as in the original, not attached
# to the request).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# urlencode percent-escapes the parameters into a plain query string.
values = {
    'wd': '中国'
}
data = urllib.parse.urlencode(values)

# GET: the encoded query string is simply appended to the URL.
resp = urllib.request.urlopen(url + data, timeout=10)
html = resp.read()

POST数据

post数据必须转换成字节编码

1
2
3
4
5
6
7
8
9
10
11
import urllib.parse
import urllib.request

url = 'http://httpbin.org/post'
# POST bodies must be bytes, hence the .encode('utf8') after urlencode.
values = {
    'world': 'hello'
}
data = urllib.parse.urlencode(values).encode('utf8')

# Supplying data= makes urlopen issue a POST request.
resp = urllib.request.urlopen(url, data=data)
html = resp.read()

捕获HTTP错误

1
2
3
4
5
6
7
8
import urllib.request
# FIX: urllib.error must be imported explicitly; 'import urllib.request'
# alone does not guarantee the urllib.error submodule is loaded.
import urllib.error

req = urllib.request.Request('http://www.2345.com/')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    # e.code is the HTTP status; e.read() returns the error page body.
    print(e.code)
    print(e.read().decode("utf8"))  # FIX: closing parenthesis was missing

使用代理服务器

1
2
3
4
5
6
7
8
9
10
11
12
import urllib.request

url = 'https://www.biquge.com.tw/18_18998/'
########################################################
proxy_addr = '182.61.59.147:9999'
# FIX: the target URL uses https, but the proxy was only registered for the
# 'http' scheme, so requests silently bypassed it. Register both schemes.
proxy = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# Install globally so every subsequent urlopen() goes through the proxy.
urllib.request.install_opener(opener)
########################################################
response = urllib.request.urlopen(url)
states = response.getcode()  # HTTP status code
html = response.read().decode('gbk')  # the site serves GBK-encoded pages

使用cookie

爬取的网页涉及登录信息。访问每一个互联网页面,都是通过HTTP协议进行的,而HTTP协议是一个无状态协议,所谓的无状态协议即无法维持会话之间的状态

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Log in with a POST request and keep the session cookie so that later
# requests through the same (globally installed) opener stay authenticated.
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar

url = 'http://bbs.chinaunix.net/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=La2A2'
data = {
    'username': 'zhanghao',
    'password': 'mima',
}
# POST bodies must be bytes.
postdata = urllib.parse.urlencode(data).encode('utf8')
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

request = urllib.request.Request(url, postdata, headers=header)
# CookieJar holds the cookies the server sets on login.
cjar = http.cookiejar.CookieJar()
# HTTPCookieProcessor stores/replays cookies automatically; build an opener
# around it and install it globally so plain urlopen() is cookie-aware too.
cookie = urllib.request.HTTPCookieProcessor(cjar)
opener = urllib.request.build_opener(cookie)
urllib.request.install_opener(opener)

try:
    response = urllib.request.urlopen(request)
    # FIX: read inside try/else -- the original read unconditionally and
    # raised NameError when the request had failed.
    html = response.read()
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.reason)
else:
    # Opening a second page now reuses the stored login cookie, so it is
    # fetched in the logged-in state.
    url2 = 'http://bbs.chinaunix.net/forum-327-1.html'
    # FIX: the original opened `url` again instead of `url2`, contradicting
    # its own comment about visiting the second page.
    response2 = urllib.request.urlopen(url2)
    html2 = response2.read()